In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from os import environ
from citrination_client import CitrinationClient
from citrination_client import *
from pypif import pif
from pypif.obj import *
import csv
import pandas as pd
%matplotlib inline

Getting Flexural Strength CSV

In [ ]:
client = CitrinationClient(environ['CITRINATION_API_KEY'], 'https://citrination.com')
dataset_id = '151803'
value_query = FieldQuery(extract_as="Flexural Strength", extract_all=True)
property_query = PropertyQuery(name=FieldQuery(filter=[Filter(equal="Flexural Strength")]), value=value_query)
formula_query = ChemicalFieldQuery(extract_as="formula")
system_query = PifSystemQuery(chemical_formula=formula_query, properties=property_query)
dataset_query = DatasetQuery(id=[Filter(equal=dataset_id)])
data_query = DataQuery(dataset=dataset_query, system=system_query)
pif_query = PifSystemReturningQuery(size=10000, random_results=False, query=data_query)
search_result = client.search.pif_search(pif_query)

print("We found {} records".format(len(search_result.hits)))
print([x.extracted for x in search_result.hits[0:5]])
In [ ]:
rows = []
pif_records = [x.system for x in search_result.hits]
for system in pif_records:
    
    cryst_value= None
    for prop in system.properties:
        if prop.name == 'Crystallinity':
            cryst_value= prop.scalars[0].value
    for prop in system.properties:
        if prop.name == "Flexural Strength" and prop.units == "MPa":
            for cond in prop.conditions:
                if cond.name == "Temperature":
                    if cond.units=="$^{\\circ}$C":
                        add= 273.
                    elif cond.units=='K':
                        add= 0.
                    if len(prop.scalars) == len(cond.scalars):
                        for prop_sca, cond_sca in zip(prop.scalars, cond.scalars):
                            row = [system.chemical_formula, prop_sca.value, (float(cond_sca.value)+add), 
                                    cryst_value, system.references[0].citation]
                            rows.append(row)

with open('flexural_strength_temperature.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Formula', 'Flexural Strength (MPa)', 'Temperature (K)', 'Crystallinity','Reference'])
    writer.writerows(rows)
In [ ]:
#this code is to only use the entries with no "x" or "." in the chemical formula


# rows = []
# pif_records = [x.system for x in search_result.hits]
# for system in pif_records:
#     if "x" not in system.chemical_formula and "." not in system.chemical_formula:
#         cryst_value= None
#         for prop in system.properties:
#             if prop.name == 'Crystallinity':
#                 cryst_value= prop.scalars[0].value
#         for prop in system.properties:
#             if prop.name == "Flexural Strength" and prop.units == "MPa":
#                 for cond in prop.conditions:
#                     if cond.name == "Temperature":
#                         if cond.units=="$^{\\circ}$C":
#                             add= 273.
#                         elif cond.units=='K':
#                             add= 0.
#                         if len(prop.scalars) == len(cond.scalars):
#                             for prop_sca, cond_sca in zip(prop.scalars, cond.scalars):
#                                 row = [system.chemical_formula, prop_sca.value, (float(cond_sca.value)+add), 
#                                        cryst_value, system.references[0].citation]
#                                 rows.append(row)

# with open('flexural_strength_temperature.csv', 'w') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(['Formula', 'Flexural Strength (MPa)', 'Temperature (K)', 'Crystallinity','Reference'])
#     writer.writerows(rows)

Clean up the raw CSV and generate composition

In [28]:
import re
from matminer.utils.conversions import str_to_composition
In [29]:
fs= pd.read_csv('flexural_strength_temperature.csv', encoding = "ISO-8859-1")
fs.head()
Out[29]:
Formula Flexural Strength (MPa) Temperature (K) Crystallinity Reference
0 Si3N4.xY2O3.ySrO 707 298.0 Polycrystalline Ceramic Technology Project Data Base: Septemb...
1 Si3N4.xY2O3.ySrO 609 1373.0 Polycrystalline Ceramic Technology Project Data Base: Septemb...
2 Si3N4.xY2O3.ySrO 649 1373.0 Polycrystalline Ceramic Technology Project Data Base: Septemb...
3 Si3N4.xY2O3.ySrO 675 1373.0 Polycrystalline Ceramic Technology Project Data Base: Septemb...
4 Si3N4.xY2O3.ySrO 697 1373.0 Polycrystalline Ceramic Technology Project Data Base: Septemb...
In [30]:
len(fs['Reference'].unique())
Out[30]:
117
In [31]:
print(fs.shape)
# Determine number of polycrystalline / single crystal samples
N_polyX = fs[fs['Crystallinity']=='Polycrystalline']['Crystallinity'].shape
N_singleX = fs[fs['Crystallinity']=='Single Crystal']['Crystallinity'].shape
print('Polycrystalline: {0}, Single crystal: {1}'.format(N_polyX, N_singleX))
(1934, 5)
Polycrystalline: (1776,), Single crystal: (19,)
In [32]:
#Fill NaN values in crystallinity with polycrystalline:
fs['Crystallinity'] = fs['Crystallinity'].fillna('Unknown')
N_polyX = fs[fs['Crystallinity']=='Polycrystalline']['Crystallinity'].shape
N_singleX = fs[fs['Crystallinity']=='Single Crystal']['Crystallinity'].shape
N_unknownX = fs[fs['Crystallinity']=='Unknown']['Crystallinity'].shape
print('Polycrystalline: {0}, Single crystal: {1}, Unknown: {2}'.format(N_polyX, N_singleX, N_unknownX))
Polycrystalline: (1776,), Single crystal: (19,), Unknown: (139,)
In [33]:
# Check how many values cannot simply be transformed from string to int
#also locate which row the errors occur in

N_errors, N_total, row = 0, 0, -1
for entry in fs['Flexural Strength (MPa)']:
    row+=1
    try:
        pd.Series([entry]).astype(float)
    except:
        N_errors +=1
        print(entry)
        print(row)
        
    finally:
        N_total +=1

print('{0} errors in {1} samples'.format(N_errors, N_total))
359 (15%)
1016
1 errors in 1934 samples
In [34]:
fs.loc[1016, 'Reference']
Out[34]:
'Material Properties of a Sintered $\\alpha$-SiC, R.G. Munro, Journal of Physical and Chemical Reference Data, Vol. 26 5, pp. 1195-1203 (1997), published by American Chemical Society.'
In [35]:
fs.loc[fs['Reference']=='Material Properties of a Sintered $\\alpha$-SiC, R.G. Munro, Journal of Physical and Chemical Reference Data, Vol. 26 5, pp. 1195-1203 (1997), published by American Chemical Society.']
Out[35]:
Formula Flexural Strength (MPa) Temperature (K) Crystallinity Reference
1016 SiC 359 (15%) 293.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
1017 SiC 359 773.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
1018 SiC 397 1273.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
1019 SiC 437 1473.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
1020 SiC 446 1673.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
1021 SiC 446 1773.0 Polycrystalline Material Properties of a Sintered $\alpha$-SiC...
In [36]:
fs.at[1016, 'Flexural Strength (MPa)'] = 359
fs.loc[1016]
Out[36]:
Formula                                                                  SiC
Flexural Strength (MPa)                                                  359
Temperature (K)                                                          293
Crystallinity                                                Polycrystalline
Reference                  Material Properties of a Sintered $\alpha$-SiC...
Name: 1016, dtype: object
In [37]:
fs['Flexural Strength (MPa)'] = fs['Flexural Strength (MPa)'].astype(float)
# fs.sort_values(by='Flexural Strength (MPa)')
In [38]:
N_unique_entries = len(fs['Formula'].unique())
print('There are just {0} unique materials in the {1} entries.'.format(N_unique_entries,fs.shape[0]))
There are just 73 unique materials in the 1934 entries.
In [39]:
#look at the disribution of the data 
fsn = fs['Flexural Strength (MPa)']
fsn = pd.to_numeric(fsn, errors = 'coerce')
fsn.hist(bins=100)
plt.xlabel('Flexural Strength')
plt.ylabel('# of samples')
plt.show()
In [40]:
fs.loc[1673, 'Reference']
Out[40]:
'Strength of Diamond, D.J. Weidner, Y. Wang, and M.T. Vaughan, Science, Vol. 266, pp. 419-421 (1994), published by American Association for the Advancement of Science.'
In [41]:
#remove the outliers from the dataset in case they give the machine learning models toruble
fs = fs[fs.Reference != 'Strength of Diamond, D.J. Weidner, Y. Wang, and M.T. Vaughan, Science, Vol. 266, pp. 419-421 (1994), published by American Association for the Advancement of Science.']
In [42]:
fsn = fs['Flexural Strength (MPa)']
fsn = pd.to_numeric(fsn, errors = 'coerce')
fsn.hist(bins=50)
plt.xlabel('Flexural Strength')
plt.ylabel('# of samples')
plt.show()
In [43]:
fs.shape
Out[43]:
(1930, 5)
In [ ]:
# Parse the chemicalFormula
def formula_decompose(formula):
    '''
    decompose chemical formula 
    return
        composition: list, [(element,num),...]
            element: string
            num: string, can be math expression such as '1+0.5x'
    '''

    comp = []
    p = re.compile(r'(\d?[w-z]?)([A-Z][a-u]?)(\d*\+?\-?\d*\.?\d*[w-z]?)')

    #split the chemical formula if there is dots, but not for cases like Mg1.5x
    if re.search(r'\.', formula) and not re.search(r'\d+\.\d[w-z]', formula): 
        formula = formula.split('.')
        for item in formula:
            prefactor = '1'
            for i in re.findall(p, item):
                pre, elem, num = i
                if pre:
                    prefactor = pre
                if num == '':
                    num = '1'
                num = prefactor + '*({})'.format(num)
                comp.append((elem, num))
    else:
        prefactor = '1'
        for i in re.findall(p, formula):
            pre, elem, num = i
            if pre:
                prefactor = pre
            if num == '':
                num = '1'
            num = prefactor + '*({})'.format(num)
            comp.append((elem, num))
    return comp 

def formula_reconstruct(composition, x=0.1, y=0.1, z=0.1, w=0.1):
    '''
    reconstruct chemical formula from composition
    composition in form of [(element,num), (element,num),...]
        element: string
        num: string, can be math expression such as '1+0.5x'

    return 
        flat chemcial formula: string, such as 'Ti1.5Cu0.1Au1.0'
    '''
    flat_list = []
    for (elem, num) in composition:
        num = re.sub(r'(\d)([w-z])', r'\1*\2', num) #convert 5x to 5*x
        flat_list.append(elem)
        flat_list.append(format(eval(num), '.1f'))
    return ''.join(flat_list)
  
def formula_parser(formula):
    return formula_reconstruct(formula_decompose(formula))
In [ ]:
fs1 = fs.copy()
fs1["flatFormula"] = fs1["Formula"].map(formula_parser)
fs1.head()
#fs1.dropna(axis=1).head()
In [ ]:
fs1["composition"] =fs1["flatFormula"].transform(str_to_composition)
fs1.head()
In [ ]:
print(len(fs1['composition'].unique()))
print(len(fs1['Formula'].unique()))

Featurize the Dataframe

In [ ]:
from matminer.featurizers import composition
from matminer.featurizers.composition import ElementFraction
In [ ]:
# select featurizers

# elemental property
ep_feat = composition.ElementProperty.from_preset(preset_name="magpie")

# atomic packing efficiency
ape_feat= composition.AtomicPackingEfficiency()

# # atomic orbitals
# ao_feat = composition.AtomicOrbitals()

# stoichiometry
s_feat  = composition.Stoichiometry()

# # t metal fraction
# tmf_feat  = composition.TMetalFraction()

# # valence orbital
# vo_feat  = composition.ValenceOrbital()
In [ ]:
fsg=fs1.drop_duplicates(['Formula','Temperature (K)','Crystallinity'])
# print('Shape of dataset with no duplicates: %s'%str(fsg.shape))
# print('Number of unique chemical Formulae in the dataset: %i'%(fsg['Formula'].nunique()))
fsg.head()
In [ ]:
fsg= ep_feat.featurize_dataframe(fsg,col_id='composition',ignore_errors=True)
fsg= ape_feat.featurize_dataframe(fsg,col_id='composition',ignore_errors=True)
#fsg = ao_feat.featurize_dataframe(fsg, col_id="composition", ignore_errors=True)  
fsg = s_feat.featurize_dataframe(fsg, col_id="composition", ignore_errors=True) 
#fsg = tmf_feat.featurize_dataframe(fsg, col_id="composition", ignore_errors=True) 
#fsg = vo_feat.featurize_dataframe(fsg, col_id="composition", ignore_errors=True) 
In [ ]:
fsg.to_pickle('flexural_strength_temperature_with_reduced_features.pkl')
print('Shape of dataset with Features: %s'%str(fsg.shape))

Examine and Clean Up the Featurized Dataframe

In [2]:
#fsg= pd.read_pickle('flexural_strength_temperature_with_features.pkl')
fsg= pd.read_pickle('flexural_strength_temperature_with_reduced_features.pkl')
In [3]:
#get rid of feature columns that have NaN 

print(fsg.isnull().values.any())
fsgd = fsg.dropna(axis=1)
print(fsgd.isnull().values.any())
print(fsgd.shape)
True
False
(576, 145)
In [4]:
#examine how the flexural strength depends on temp in the dataset for each material

formulae= fsgd.Formula.unique()
cfunc= plt.cm.get_cmap('hsv',len(formulae))
colors= [cfunc(i) for i in range(len(formulae))]
colors= dict(zip(formulae,colors))
symb= ['o','v','^','*','s']


plt.figure(figsize= (10,10))
for i,item in enumerate(formulae):
    item_df= fsgd[fsgd.Formula==item]
    plt.plot(item_df['Temperature (K)'].values,np.array(item_df['Flexural Strength (MPa)'],dtype= 'float'), 
             symb[i % len(symb)],color= colors[item],label= item, markersize= 5)
plt.title('Scatter Plot of Property (Flexural Strength) and Prop Condition (Temperature)')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
plt.close()
In [5]:
# convert crystallinity to a numerical value
dict_map= {'Single Crystal':1, 'Unknown': 0, 'Polycrystalline': -1}
fsgd['Crystallinity']= fsgd['Crystallinity'].apply(lambda x: dict_map[x]) 
C:\Users\Steven\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
In [6]:
fsgd_des= fsgd.describe()
fsgd_des= fsgd_des.astype('float')
fsgd_des
Out[6]:
Flexural Strength (MPa) Temperature (K) Crystallinity minimum Number maximum Number range Number mean Number avg_dev Number mode Number minimum MendeleevNumber ... range SpaceGroupNumber mean SpaceGroupNumber avg_dev SpaceGroupNumber mode SpaceGroupNumber 0-norm 2-norm 3-norm 5-norm 7-norm 10-norm
count 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 ... 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000
mean 424.397569 954.411806 -0.880208 7.112847 28.852431 21.739583 12.942724 7.113185 7.263889 47.756944 ... 149.642361 137.573012 57.261183 94.954861 3.090278 0.700396 0.640817 0.608047 0.598646 0.593515
std 264.374451 506.711933 0.379323 1.031390 15.665551 15.325168 4.464029 5.643810 0.959972 26.471641 ... 77.241714 58.105931 35.919277 88.969666 1.007196 0.042316 0.052467 0.064027 0.069877 0.074134
min 14.000000 4.200000 -1.000000 4.000000 6.000000 1.000000 5.200000 0.320000 4.000000 8.000000 ... 28.000000 72.666667 8.960000 12.000000 2.000000 0.520683 0.435622 0.385855 0.368762 0.357366
25% 240.000000 319.750000 -1.000000 6.000000 14.000000 7.000000 10.000000 3.007476 7.000000 12.000000 ... 33.000000 74.562500 16.500000 12.000000 2.000000 0.684323 0.622410 0.574349 0.552045 0.535887
50% 355.000000 1033.500000 -1.000000 7.000000 39.000000 32.000000 10.547945 4.016356 8.000000 44.000000 ... 182.000000 103.000000 80.888889 12.000000 3.000000 0.710365 0.642563 0.596290 0.581749 0.577916
75% 572.250000 1423.000000 -1.000000 8.000000 40.000000 32.000000 18.125000 13.671875 8.000000 73.000000 ... 213.500000 203.027397 84.072022 194.000000 4.000000 0.721110 0.674811 0.659173 0.656769 0.656289
max 1265.000000 2273.000000 1.000000 8.000000 74.000000 67.000000 27.333333 29.777778 8.000000 78.000000 ... 217.000000 210.500000 103.335799 194.000000 7.000000 0.824621 0.804145 0.800156 0.800007 0.800000

8 rows × 141 columns

In [7]:
#examining the std, we see that some of the columns have 0 std which is bad for ML. we will get rid of these later
std= np.array(fsgd_des.loc['std'],dtype= 'float')
std
Out[7]:
array([2.64374451e+02, 5.06711933e+02, 3.79322909e-01, 1.03138989e+00,
       1.56655511e+01, 1.53251676e+01, 4.46402886e+00, 5.64381008e+00,
       9.59971819e-01, 2.64716410e+01, 4.03253039e+00, 2.82495389e+01,
       5.25119914e+00, 8.03171464e+00, 5.75701106e+00, 1.89182347e+00,
       3.88544154e+01, 3.82142885e+01, 1.07660862e+01, 1.38151042e+01,
       1.74865998e+00, 6.34703056e+02, 8.90241180e+02, 7.97550430e+02,
       7.04718821e+02, 2.30202667e+02, 7.73727289e+02, 5.16240474e+00,
       8.36024640e-01, 5.54786702e+00, 1.59300745e+00, 2.30153929e+00,
       2.62265711e+00, 0.00000000e+00, 1.11718974e+00, 1.11718974e+00,
       3.01863427e-01, 4.19264960e-01, 0.00000000e+00, 4.56461626e+00,
       3.50535737e+01, 3.70142354e+01, 8.26045517e+00, 1.36877269e+01,
       5.10089388e+00, 2.78335833e-01, 3.76079830e-01, 5.69880499e-01,
       2.27944583e-01, 2.47758494e-01, 6.79244159e-01, 1.09662966e-01,
       0.00000000e+00, 1.09662966e-01, 1.89517446e-03, 3.71687490e-03,
       0.00000000e+00, 7.77677702e-01, 8.36024640e-01, 1.34529059e+00,
       5.05384381e-01, 6.35376358e-01, 1.18375554e+00, 0.00000000e+00,
       1.06575907e+00, 1.06575907e+00, 2.94270111e-01, 3.90656492e-01,
       0.00000000e+00, 0.00000000e+00, 1.92187933e+00, 1.92187933e+00,
       4.74422892e-01, 6.33152015e-01, 0.00000000e+00, 6.55852048e-01,
       1.92074550e+00, 2.07850673e+00, 6.54238787e-01, 7.38745677e-01,
       1.18375554e+00, 0.00000000e+00, 1.09662966e-01, 1.09662966e-01,
       1.89517446e-03, 3.71687490e-03, 0.00000000e+00, 1.46906507e+00,
       1.28469131e+00, 1.50021637e+00, 1.09618653e+00, 4.59321816e-01,
       1.02235874e+00, 0.00000000e+00, 4.24776014e+00, 4.24776014e+00,
       1.27111076e+00, 1.66390499e+00, 0.00000000e+00, 0.00000000e+00,
       2.43867725e+00, 2.43867725e+00, 5.92229775e-02, 1.15311933e-01,
       0.00000000e+00, 1.30000000e+00, 3.57128874e+00, 3.96774701e+00,
       8.07573566e-01, 9.76961766e-01, 1.02235874e+00, 2.53960227e+00,
       7.46399321e+00, 7.92700542e+00, 2.40490332e+00, 2.15707758e+00,
       3.08703115e+00, 3.35003630e-01, 2.90944865e+00, 2.80061905e+00,
       1.65796730e+00, 1.29195059e+00, 2.69343860e+00, 0.00000000e+00,
       5.01114796e-06, 5.01114796e-06, 1.07763534e-06, 1.44479670e-06,
       0.00000000e+00, 8.03156659e+01, 1.58262818e+01, 7.72417135e+01,
       5.81059311e+01, 3.59192770e+01, 8.89696658e+01, 1.00719633e+00,
       4.23159759e-02, 5.24665357e-02, 6.40268543e-02, 6.98765118e-02,
       7.41344454e-02])
In [8]:
#set X and y for ML.

y = fsgd['Flexural Strength (MPa)'].astype('float')
print(y.shape)

excluded = ['Formula', 'Crystallinity', 'Reference', 'flatFormula', 'composition', 'Flexural Strength (MPa)']
X = fsgd.drop(excluded, axis=1).astype('float')
print(X.shape)

print(y.describe())

dat_des= X.describe()
dat_des
(576,)
(576, 139)
count     576.000000
mean      424.397569
std       264.374451
min        14.000000
25%       240.000000
50%       355.000000
75%       572.250000
max      1265.000000
Name: Flexural Strength (MPa), dtype: float64
Out[8]:
Temperature (K) minimum Number maximum Number range Number mean Number avg_dev Number mode Number minimum MendeleevNumber maximum MendeleevNumber range MendeleevNumber ... range SpaceGroupNumber mean SpaceGroupNumber avg_dev SpaceGroupNumber mode SpaceGroupNumber 0-norm 2-norm 3-norm 5-norm 7-norm 10-norm
count 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 ... 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000 576.000000
mean 954.411806 7.112847 28.852431 21.739583 12.942724 7.113185 7.263889 47.756944 84.654514 36.897569 ... 149.642361 137.573012 57.261183 94.954861 3.090278 0.700396 0.640817 0.608047 0.598646 0.593515
std 506.711933 1.031390 15.665551 15.325168 4.464029 5.643810 0.959972 26.471641 4.032530 28.249539 ... 77.241714 58.105931 35.919277 88.969666 1.007196 0.042316 0.052467 0.064027 0.069877 0.074134
min 4.200000 4.000000 6.000000 1.000000 5.200000 0.320000 4.000000 8.000000 72.000000 1.000000 ... 28.000000 72.666667 8.960000 12.000000 2.000000 0.520683 0.435622 0.385855 0.368762 0.357366
25% 319.750000 6.000000 14.000000 7.000000 10.000000 3.007476 7.000000 12.000000 82.000000 9.000000 ... 33.000000 74.562500 16.500000 12.000000 2.000000 0.684323 0.622410 0.574349 0.552045 0.535887
50% 1033.500000 7.000000 39.000000 32.000000 10.547945 4.016356 8.000000 44.000000 87.000000 35.000000 ... 182.000000 103.000000 80.888889 12.000000 3.000000 0.710365 0.642563 0.596290 0.581749 0.577916
75% 1423.000000 8.000000 40.000000 32.000000 18.125000 13.671875 8.000000 73.000000 87.000000 75.000000 ... 213.500000 203.027397 84.072022 194.000000 4.000000 0.721110 0.674811 0.659173 0.656769 0.656289
max 2273.000000 8.000000 74.000000 67.000000 27.333333 29.777778 8.000000 78.000000 87.000000 79.000000 ... 217.000000 210.500000 103.335799 194.000000 7.000000 0.824621 0.804145 0.800156 0.800007 0.800000

8 rows × 139 columns

In [9]:
# drop cols where 75%-25% < 0.5*std
drop_cols= dat_des.columns[abs(dat_des.loc['25%']-dat_des.loc['75%'])<=(0.5*dat_des.loc['std'])]
print('Number of columns to be dropped: %i'%len(drop_cols))
X= X.drop(drop_cols,axis= 1)
dat_des= dat_des.drop(drop_cols,axis= 1)
print('Shape of the training dataset: %s'%str(X.shape))
Number of columns to be dropped: 43
Shape of the training dataset: (576, 96)

Try some ML algorithms (linear regression and random forest)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn import preprocessing
import numpy as np
In [11]:
lr = LinearRegression()
lr.fit(X, y)

# get fit statistics
print('training R2 = ' + str(round(lr.score(X, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(X))))
training R2 = 0.627
training RMSE = 161.255
In [12]:
# Use 10-fold cross validation (90% training, 10% test)
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(lr, X, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
Cross-validation results:
Folds: 10, mean R2: 448.447
Folds: 10, mean RMSE: 3100.608
In [13]:
from matminer.figrecipes.plot import PlotlyFig
from sklearn.model_selection import cross_val_predict

pf = PlotlyFig(x_title='Flexural Strength (MPa)',
               y_title='Predicted Flexural Strength (MPa)',
               title='Linear regression',
               mode='notebook',
               filename="lr_regression.html")

pf.xy(xy_pairs=[(y, cross_val_predict(lr, X, y, cv=crossvalidation)), ([0, 12], [0, 12])], 
      labels=fsgd['Formula'], 
      modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], 
      showlegends=False
     )
In [14]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1)

rf.fit(X, y)
print('training R2 = ' + str(round(rf.score(X, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(X))))
C:\Users\Steven\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning:

numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.

training R2 = 0.912
training RMSE = 78.476
In [15]:
# compute cross validation scores for random forest model
r2_scores = cross_val_score(rf, X, y, scoring='r2', cv=crossvalidation, n_jobs=-1)
scores = cross_val_score(rf, X, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=-1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
Cross-validation results:
Folds: 10, mean R2: 0.489
Folds: 10, mean RMSE: 184.511
In [16]:
from matminer.figrecipes.plot import PlotlyFig

pf_rf = PlotlyFig(x_title='Fracture Toughness (Mpa m^(1/2))',
                  y_title='Random forest Fracture Toughness (Mpa m^(1/2))',
                  title='Random forest regression',
                  mode='notebook',
                  filename="rf_regression.html")

pf_rf.xy([(y, cross_val_predict(rf, X, y, cv=crossvalidation)), ([0, 12], [0, 12])], 
      labels=fsgd['Formula'], modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], showlegends=False)
In [17]:
from sklearn.model_selection import train_test_split
X['formula'] = fsgd['Formula']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
train_formula = X_train['formula']
X_train = X_train.drop('formula', axis=1)
test_formula = X_test['formula']
X_test = X_test.drop('formula', axis=1)

rf_reg = RandomForestRegressor(n_estimators=50, random_state=1)
rf_reg.fit(X_train, y_train)

# get fit statistics
print('training R2 = ' + str(round(rf_reg.score(X_train, y_train), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_train, y_pred=rf_reg.predict(X_train))))
print('test R2 = ' + str(round(rf_reg.score(X_test, y_test), 3)))
print('test RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y_test, y_pred=rf_reg.predict(X_test))))
training R2 = 0.909
training RMSE = 79.856
test R2 = 0.49
test RMSE = 187.372
In [18]:
importances = rf.feature_importances_
# included = np.asarray(included)
included = X.columns.values
indices = np.argsort(importances)[::-1]

pf = PlotlyFig(y_title='Importance (%)',
               title='Feature by importances',
               mode='notebook',
               fontsize=20,
               ticksize=15)

pf.bar(x=included[indices][0:10], y=importances[indices][0:10])
In [ ]:
 
In [19]:
print(included[indices][0:10])
['Temperature (K)' 'avg_dev NpUnfilled' 'avg_dev GSvolume_pa'
 'avg_dev SpaceGroupNumber' 'range MeltingT' 'avg_dev MeltingT'
 'mean GSvolume_pa' 'mean NUnfilled' 'range GSvolume_pa' 'mean NpValence']
In [20]:
# Xr = X[['Temperature (K)', 'avg_dev NpUnfilled', 'avg_dev GSvolume_pa',
#  'avg_dev SpaceGroupNumber', 'range MeltingT', 'avg_dev MeltingT',
#  'mean GSvolume_pa', 'mean NUnfilled', 'range GSvolume_pa', 'mean NpValence']].copy()
Xr = X[['Temperature (K)', 'avg_dev NpUnfilled', 'avg_dev GSvolume_pa',
 'avg_dev SpaceGroupNumber', 'range MeltingT', 'avg_dev MeltingT', 'mean GSvolume_pa', 'mean NUnfilled' ]].copy()
#Xr = preprocessing.scale(Xr)
Xr
Out[20]:
Temperature (K) avg_dev NpUnfilled avg_dev GSvolume_pa avg_dev SpaceGroupNumber range MeltingT avg_dev MeltingT mean GSvolume_pa mean NUnfilled
0 298.0 0.607185 3.885455 23.455557 1744.20 789.641558 17.653636 3.454545
1 1373.0 0.607185 3.885455 23.455557 1744.20 789.641558 17.653636 3.454545
5 1473.0 0.607185 3.885455 23.455557 1744.20 789.641558 17.653636 3.454545
7 1573.0 0.607185 3.885455 23.455557 1744.20 789.641558 17.653636 3.454545
10 298.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
12 1323.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
13 298.0 1.438234 3.681652 101.206123 2073.20 447.626344 12.153868 3.245283
14 523.0 1.438234 3.681652 101.206123 2073.20 447.626344 12.153868 3.245283
15 1073.0 1.438234 3.681652 101.206123 2073.20 447.626344 12.153868 3.245283
16 1323.0 1.438234 3.681652 101.206123 2073.20 447.626344 12.153868 3.245283
27 295.0 0.268800 6.907840 19.424000 2136.00 959.552000 11.805200 4.160000
28 298.0 0.630000 3.385550 27.225000 1744.20 775.787512 16.953375 3.500000
29 1273.0 0.630000 3.385550 27.225000 1744.20 775.787512 16.953375 3.500000
30 295.0 0.661344 3.703070 34.146030 1744.20 767.962584 17.004548 3.530120
31 295.0 0.090909 6.940857 16.409091 3759.95 1157.416529 13.274943 4.000000
32 296.0 1.212457 4.913705 94.541730 2073.20 613.168554 13.045235 3.576471
35 873.0 1.212457 4.913705 94.541730 2073.20 613.168554 13.045235 3.576471
36 1273.0 1.212457 4.913705 94.541730 2073.20 613.168554 13.045235 3.576471
37 1473.0 1.212457 4.913705 94.541730 2073.20 613.168554 13.045235 3.576471
38 1573.0 1.212457 4.913705 94.541730 2073.20 613.168554 13.045235 3.576471
44 293.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
45 1273.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
46 1473.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
47 1673.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
50 1573.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
58 77.0 0.888889 6.829697 80.888889 2073.20 878.715152 14.227273 4.424242
59 203.0 0.888889 6.829697 80.888889 2073.20 878.715152 14.227273 4.424242
60 298.0 0.888889 6.829697 80.888889 2073.20 878.715152 14.227273 4.424242
64 298.0 0.914127 6.363747 84.072022 2073.20 813.903740 13.941447 4.263158
65 523.0 0.914127 6.363747 84.072022 2073.20 813.903740 13.941447 4.263158
... ... ... ... ... ... ... ... ...
1825 771.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1826 826.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1828 916.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1829 1025.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1830 1178.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1831 287.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1832 472.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1834 869.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1835 968.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1836 1074.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1837 1181.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1841 769.0 0.902344 6.344502 82.113281 2073.20 885.945703 13.938906 3.812500
1847 1077.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
1848 1541.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
1850 1536.0 0.489796 2.777755 16.163265 1623.95 795.404082 17.199286 3.428571
1866 295.0 0.576000 3.262261 21.664000 1632.20 773.691449 17.218600 3.573333
1867 295.0 1.440000 3.540000 102.240000 878.67 421.761600 12.055000 3.200000
1868 295.0 0.000000 7.400000 16.500000 2136.00 1068.000000 13.040000 4.000000
1873 1473.0 0.000000 7.400000 16.500000 2136.00 1068.000000 13.040000 4.000000
1874 1673.0 0.000000 7.400000 16.500000 2136.00 1068.000000 13.040000 4.000000
1890 296.0 0.573156 3.217554 19.722667 3768.20 826.967893 17.089733 3.706667
1892 573.0 0.901224 7.037812 82.011429 2073.20 909.500735 14.459857 4.114286
1893 753.0 0.901224 7.037812 82.011429 2073.20 909.500735 14.459857 4.114286
1894 1053.0 0.901224 7.037812 82.011429 2073.20 909.500735 14.459857 4.114286
1899 1073.0 0.901224 7.037812 82.011429 2073.20 909.500735 14.459857 4.114286
1907 1573.0 0.640000 3.628400 63.520000 1632.20 780.816000 15.904500 3.200000
1921 296.0 2.222222 7.121111 12.444444 220.00 97.777778 12.513333 6.000000
1922 963.0 2.222222 7.121111 12.444444 220.00 97.777778 12.513333 6.000000
1923 1273.0 2.222222 7.121111 12.444444 220.00 97.777778 12.513333 6.000000
1924 1673.0 2.222222 7.121111 12.444444 220.00 97.777778 12.513333 6.000000

576 rows × 8 columns

In [21]:
lr = LinearRegression()
lr.fit(Xr, y)

# get fit statistics
print('training R2 = ' + str(round(lr.score(Xr, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=lr.predict(Xr))))
training R2 = 0.243
training RMSE = 229.838
In [22]:
# Use 10-fold cross validation (90% training, 10% test)
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(lr, Xr, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]
r2_scores = cross_val_score(lr, Xr, y, scoring='r2', cv=crossvalidation, n_jobs=1)

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
Cross-validation results:
Folds: 10, mean R2: 0.202
Folds: 10, mean RMSE: 232.328
In [23]:
pf = PlotlyFig(x_title='Flexural Strength (MPa)',
               y_title='Predicted Flexural Strength (MPa)',
               title='Linear regression',
               mode='notebook',
               filename="lr_regression.html")

pf.xy(xy_pairs=[(y, cross_val_predict(lr, Xr, y, cv=crossvalidation)), ([0, 16], [0, 16])], 
      labels=fsgd['Formula'], 
      modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], 
      showlegends=False
     )
In [24]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1)

rf.fit(Xr, y)
print('training R2 = ' + str(round(rf.score(Xr, y), 3)))
print('training RMSE = %.3f' % np.sqrt(mean_squared_error(y_true=y, y_pred=rf.predict(Xr))))
training R2 = 0.908
training RMSE = 80.044
In [25]:
# compute cross validation scores for random forest model
r2_scores = cross_val_score(rf, Xr, y, scoring='r2', cv=crossvalidation, n_jobs=-1)
scores = cross_val_score(rf, Xr, y, scoring='neg_mean_squared_error', cv=crossvalidation, n_jobs=-1)
rmse_scores = [np.sqrt(abs(s)) for s in scores]

print('Cross-validation results:')
print('Folds: %i, mean R2: %.3f' % (len(scores), np.mean(np.abs(r2_scores))))
print('Folds: %i, mean RMSE: %.3f' % (len(scores), np.mean(np.abs(rmse_scores))))
Cross-validation results:
Folds: 10, mean R2: 0.473
Folds: 10, mean RMSE: 187.612
In [26]:
from matminer.figrecipes.plot import PlotlyFig

pf_rf = PlotlyFig(x_title='Fracture Toughness (Mpa m^(1/2))',
                  y_title='Random forest Fracture Toughness (Mpa m^(1/2))',
                  title='Random forest regression',
                  mode='notebook',
                  filename="rf_regression.html")

pf_rf.xy([(y, cross_val_predict(rf, Xr, y, cv=crossvalidation)), ([0, 12], [0, 12])], 
      labels=fsgd['Formula'], modes=['markers', 'lines'],
      lines=[{}, {'color': 'black', 'dash': 'dash'}], showlegends=False)
In [27]:
importances = rf.feature_importances_
# included = np.asarray(included)
included = X.columns.values
indices = np.argsort(importances)[::-1]

pf = PlotlyFig(y_title='Importance (%)',
               title='Feature by importances',
               mode='notebook',
               fontsize=20,
               ticksize=15)

pf.bar(x=included[indices][0:10], y=importances[indices][0:10])
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
# feats = {} # a dict to hold feature_name: feature_importance
# for feature, importance in zip(X.columns, rf.feature_importances_):
#     feats[feature] = importance #add the name/value pair 

# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
# imp = importances.sort_values(by = 'Gini-importance')[-10:]
# imp.plot(kind='bar')
In [ ]: